Alumno Código
Valeria Llactayo Peña 17160046
Joaquin Romualdo Peña 17150052
Javier Sullcaray Barrientos 17160200
Jose Victorio Gonzales 17160051
Daniel Yauri Leiva 17160208

En el presente trabajo se muestra el proceso de manipulación de datos de incendios forestales.

#Comenzaremos subiendo los datos que usaremos
load("prep_eda.RData")

ANÁLISIS EXPLORATORIO DE DATOS

# Librerias
library(tidyverse)
library(VIM)
library(skimr)
library(psych)
library(GGally)
library(visdat)
library(plotly)
library(viridis)
library(hrbrthemes)
# Dataset

fires
## # A tibble: 517 x 13
##        X     Y month day    FFMC   DMC    DC   ISI  temp    RH  wind  rain  area
##    <dbl> <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1     7     5 Mar   5      86.2  26.2  94.3   5.1   8.2    51   6.7   0       0
##  2     7     4 Oct   2      90.6  35.4 669.    6.7  18      33   0.9   0       0
##  3     7     4 Oct   6      90.6  43.7 687.    6.7  14.6    33   1.3   0       0
##  4     8     6 Mar   5      91.7  33.3  77.5   9     8.3    97   4     0.2     0
##  5     8     6 Mar   7      89.3  51.3 102.    9.6  11.4    99   1.8   0       0
##  6     8     6 Aug   7      92.3  85.3 488    14.7  22.2    29   5.4   0       0
##  7     8     6 Aug   1      92.3  88.9 496.    8.5  24.1    27   3.1   0       0
##  8     8     6 Aug   1      91.5 145.  608.   10.7   8      86   2.2   0       0
##  9     8     6 Sep   2      91   130.  693.    7    13.1    63   5.4   0       0
## 10     7     5 Sep   6      92.5  88   699.    7.1  22.8    40   4     0       0
## # ... with 507 more rows
# Resumenes estadísticos

summary(fires)
##        X               Y           month     day         FFMC      
##  Min.   :1.000   Min.   :2.0   Aug    :184   1:74   Min.   :18.70  
##  1st Qu.:3.000   1st Qu.:4.0   Sep    :172   2:64   1st Qu.:90.20  
##  Median :4.000   Median :4.0   Mar    : 54   3:54   Median :91.60  
##  Mean   :4.669   Mean   :4.3   Jul    : 32   4:61   Mean   :90.64  
##  3rd Qu.:7.000   3rd Qu.:5.0   Feb    : 20   5:85   3rd Qu.:92.90  
##  Max.   :9.000   Max.   :9.0   Jun    : 17   6:84   Max.   :96.20  
##                                (Other): 38   7:95                  
##       DMC              DC             ISI              temp      
##  Min.   :  1.1   Min.   :  7.9   Min.   : 0.000   Min.   : 2.20  
##  1st Qu.: 68.6   1st Qu.:437.7   1st Qu.: 6.500   1st Qu.:15.50  
##  Median :108.3   Median :664.2   Median : 8.400   Median :19.30  
##  Mean   :110.9   Mean   :547.9   Mean   : 9.022   Mean   :18.89  
##  3rd Qu.:142.4   3rd Qu.:713.9   3rd Qu.:10.800   3rd Qu.:22.80  
##  Max.   :291.3   Max.   :860.6   Max.   :56.100   Max.   :33.30  
##                                                                  
##        RH              wind            rain              area        
##  Min.   : 15.00   Min.   :0.400   Min.   :0.00000   Min.   :   0.00  
##  1st Qu.: 33.00   1st Qu.:2.700   1st Qu.:0.00000   1st Qu.:   0.00  
##  Median : 42.00   Median :4.000   Median :0.00000   Median :   0.52  
##  Mean   : 44.29   Mean   :4.018   Mean   :0.02166   Mean   :  12.85  
##  3rd Qu.: 53.00   3rd Qu.:4.900   3rd Qu.:0.00000   3rd Qu.:   6.57  
##  Max.   :100.00   Max.   :9.400   Max.   :6.40000   Max.   :1090.84  
## 
describe(fires)
##        vars   n   mean     sd median trimmed    mad  min     max   range  skew
## X         1 517   4.67   2.31   4.00    4.67   2.97  1.0    9.00    8.00  0.04
## Y         2 517   4.30   1.23   4.00    4.31   1.48  2.0    9.00    7.00  0.41
## month*    3 517   7.48   2.28   8.00    7.79   1.48  1.0   12.00   11.00 -1.21
## day*      4 517   4.26   2.07   5.00    4.32   2.97  1.0    7.00    6.00 -0.21
## FFMC      5 517  90.64   5.52  91.60   91.45   1.93 18.7   96.20   77.50 -6.54
## DMC       6 517 110.87  64.05 108.30  106.52  51.74  1.1  291.30  290.20  0.54
## DC        7 517 547.94 248.07 664.20  578.69 118.90  7.9  860.60  852.70 -1.09
## ISI       8 517   9.02   4.56   8.40    8.73   3.11  0.0   56.10   56.10  2.52
## temp      9 517  18.89   5.81  19.30   19.09   5.34  2.2   33.30   31.10 -0.33
## RH       10 517  44.29  16.32  42.00   42.71  14.83 15.0  100.00   85.00  0.86
## wind     11 517   4.02   1.79   4.00    3.90   1.93  0.4    9.40    9.00  0.57
## rain     12 517   0.02   0.30   0.00    0.00   0.00  0.0    6.40    6.40 19.70
## area     13 517  12.85  63.66   0.52    3.18   0.77  0.0 1090.84 1090.84 12.77
##        kurtosis    se
## X         -1.18  0.10
## Y          1.38  0.05
## month*     0.61  0.10
## day*      -1.29  0.09
## FFMC      66.14  0.24
## DMC        0.18  2.82
## DC        -0.27 10.91
## ISI       21.15  0.20
## temp       0.11  0.26
## RH         0.41  0.72
## wind       0.03  0.08
## rain     415.60  0.01
## area     191.50  2.80

Cantidad de Missing Values

## 
##  Variables sorted by number of missings: 
##  Variable Count
##         X     0
##         Y     0
##     month     0
##       day     0
##      FFMC     0
##       DMC     0
##        DC     0
##       ISI     0
##      temp     0
##        RH     0
##      wind     0
##      rain     0
##      area     0

Distribución espacial de incendios

**** Distribución de cantidad de incendios por mes **** ****

Pieplot de cantidad de incendios por mes


Distribución de variables por boxplot

# Distribución de variables por densidad

library(ggridges)

ggplot(fires_long2, aes(x = value, y = variable)) +
  geom_density_ridges() +
  facet_wrap(~month, scales = "free")

ggplot(fires_long2, aes(x = value, y = variable, fill = stat(quantile))) +
  stat_density_ridges(quantile_lines = FALSE,
                      calc_ecdf = TRUE,
                      geom = "density_ridges_gradient") +
  scale_fill_brewer(name = "") +
  facet_wrap(~month, scales = "free")

# Boxplot áreas por mes

ggplot(fires, aes(x = month, y = area)) +
  geom_boxplot(aes(fill = month, alpha = 0.5)) +
  geom_jitter(color="red", size=1.5, alpha=0.5, width = 0.1) +
  theme_ipsum_tw() +
  facet_wrap(~month, scales = "free") +
  labs(
    title = "Áreas quemadas por mes",
    subtitle = "Áreas en hectáreas",
    caption = "Elaboración propia"
  ) +
  theme_ipsum_tw(grid="XY", axis="xy") +
  theme(legend.position = "none")

# Boxplot áreas != 0.0 ha

fires_ars <-
  fires %>% 
  dplyr::filter(
    area >= 1
  ) %>% 
  dplyr::select(-rain)
summary(fires_ars)
##        X               Y             month    day         FFMC      
##  Min.   :1.000   Min.   :2.000   Sep    :90   1:36   Min.   :63.50  
##  1st Qu.:3.000   1st Qu.:4.000   Aug    :83   2:30   1st Qu.:90.35  
##  Median :5.000   Median :4.000   Mar    :18   3:26   Median :91.70  
##  Mean   :4.819   Mean   :4.379   Jul    :16   4:30   Mean   :90.98  
##  3rd Qu.:7.000   3rd Qu.:5.000   Feb    :10   5:39   3rd Qu.:92.90  
##  Max.   :9.000   Max.   :9.000   Dec    : 9   6:39   Max.   :96.20  
##                                  (Other):17   7:43                  
##       DMC              DC             ISI              temp      
##  Min.   :  3.2   Min.   : 15.3   Min.   : 0.800   Min.   : 2.20  
##  1st Qu.: 81.8   1st Qu.:470.6   1st Qu.: 6.800   1st Qu.:15.90  
##  Median :111.7   Median :665.6   Median : 8.400   Median :19.90  
##  Mean   :114.6   Mean   :569.2   Mean   : 9.044   Mean   :19.09  
##  3rd Qu.:141.2   3rd Qu.:724.1   3rd Qu.:10.900   3rd Qu.:23.35  
##  Max.   :291.3   Max.   :860.6   Max.   :21.300   Max.   :33.30  
##                                                                  
##        RH             wind            area        
##  Min.   :15.00   Min.   :0.400   Min.   :   1.01  
##  1st Qu.:33.00   1st Qu.:2.700   1st Qu.:   2.99  
##  Median :41.00   Median :4.000   Median :   7.19  
##  Mean   :43.72   Mean   :4.125   Mean   :  27.27  
##  3rd Qu.:53.00   3rd Qu.:4.900   3rd Qu.:  18.07  
##  Max.   :96.00   Max.   :9.400   Max.   :1090.84  
## 
describe(fires_ars)
##        vars   n   mean     sd median trimmed    mad   min     max   range  skew
## X         1 243   4.82   2.35   5.00    4.84   2.97  1.00    9.00    8.00 -0.03
## Y         2 243   4.38   1.14   4.00    4.37   1.48  2.00    9.00    7.00  0.47
## month*    3 243   7.74   2.19   8.00    8.06   1.48  2.00   12.00   10.00 -1.23
## day*      4 243   4.21   2.07   4.00    4.27   2.97  1.00    7.00    6.00 -0.18
## FFMC      5 243  90.98   3.81  91.70   91.53   1.78 63.50   96.20   32.70 -2.69
## DMC       6 243 114.64  63.56 111.70  110.75  43.88  3.20  291.30  288.10  0.54
## DC        7 243 569.16 236.21 665.60  603.22 118.16 15.30  860.60  845.30 -1.23
## ISI       8 243   9.04   4.13   8.40    8.78   2.97  0.80   21.30   20.50  0.67
## temp      9 243  19.09   6.35  19.90   19.43   5.49  2.20   33.30   31.10 -0.46
## RH       10 243  43.72  15.40  41.00   42.30  14.83 15.00   96.00   81.00  0.78
## wind     11 243   4.12   1.89   4.00    3.96   1.93  0.40    9.40    9.00  0.68
## area     12 243  27.27  90.81   7.19   11.45   7.43  1.01 1090.84 1089.83  8.89
##        kurtosis    se
## X         -1.20  0.15
## Y          1.51  0.07
## month*     1.30  0.14
## day*      -1.30  0.13
## FFMC      12.49  0.24
## DMC        0.38  4.08
## DC         0.23 15.15
## ISI        0.28  0.26
## temp       0.03  0.41
## RH         0.13  0.99
## wind       0.17  0.12
## area      91.13  5.83
ggplot(fires_ars, aes(x = month, y = area)) +
  geom_boxplot(aes(fill = month, alpha = 0.5)) +
  geom_jitter(color="red", size=1.5, alpha=0.5, width = 0.1) +
  theme_ipsum_tw() +
  facet_wrap(~month, scales = "free") +
  labs(
    title = "Áreas quemadas significativas por mes",
    subtitle = "Áreas mayores a 0 hectáreas",
    caption = "Elaboración propia"
  ) +
  theme_ipsum_tw(grid="XY", axis="xy") +
  theme(legend.position = "none")

library(ggridges)

ggplot(fires, aes(x = temp, y = month, fill = stat(x))) +
  geom_density_ridges_gradient(quantile_lines = TRUE, alpha = 0.75,
                               quantiles = c(0.05, 0.5, 0.95)) +
  scale_fill_viridis_c(name = "T (°C)", option = "C") +
  theme_ipsum() +
  labs(x = "Temperatura", y = "Mes")

# Data filtrada por el periodo de mayor ocurrencia de incendios

fires_per # Se consideran áreas de 0 ha
## # A tibble: 356 x 13
##        X     Y month  FFMC   DMC    DC   ISI  temp    RH  wind  rain  area grid 
##    <dbl> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
##  1     8     6 8      92.3  85.3  488   14.7  22.2    29   5.4     0     0 8-6  
##  2     8     6 8      92.3  88.9  496.   8.5  24.1    27   3.1     0     0 8-6  
##  3     8     6 8      91.5 145.   608.  10.7   8      86   2.2     0     0 8-6  
##  4     8     6 9      91   130.   693.   7    13.1    63   5.4     0     0 8-6  
##  5     7     5 9      92.5  88    699.   7.1  22.8    40   4       0     0 7-5  
##  6     7     5 9      92.5  88    699.   7.1  17.8    51   7.2     0     0 7-5  
##  7     7     5 9      92.8  73.2  713   22.6  19.3    38   4       0     0 7-5  
##  8     6     5 8      63.5  70.8  665.   0.8  17      72   6.7     0     0 6-5  
##  9     6     5 9      90.9 126.   686.   7    21.3    42   2.2     0     0 6-5  
## 10     6     5 9      92.9 133.   700.   9.2  26.4    21   4.5     0     0 6-5  
## # ... with 346 more rows
fires_pa <- 
  fires_per %>% 
  dplyr::filter(
    area > 0.00
  )

ggplot(fires_pa, aes(x = "Aug - Sep", y = area)) +
  geom_boxplot() +
  geom_jitter(color="red", size=1.5, alpha=0.5) +
  theme_ipsum_tw() +
  labs(
    title = "Áreas para el periodo Aug - Sep",
    subtitle = "Áreas mayores a 0 hectáreas",
    caption = "Elaboración propia"
  ) +
  theme(legend.position = "none")

fires_pap <- # Sin contar areas > 600 ha
  fires_pa %>% 
  dplyr::filter(
    area < 600
  )

Data para procesamiento

# Data para procesamiento

fires_pca
## # A tibble: 33 x 7
##     FFMC   DMC    DC   ISI  temp    RH   area
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
##  1  91   130.   693.   7    18.8    40 213.  
##  2  91   276.   825.   7.1  21.9    43  70.8 
##  3  91.7 191.   636.   7.8  19.9    50  82.8 
##  4  93.5 149.   729.   8.1  27.8    27  95.2 
##  5  92.5 121.   674.   8.6  18.2    46 201.  
##  6  91.6 108.   764    6.2  18      51   0   
##  7  81.6  56.7  666.   1.9  21.9    71  54.3 
##  8  94.9 130.   587.  14.1  33.1    25  26.4 
##  9  92.2 102.   752.   8.4  24.2    27   6.58
## 10  93.3 141.   714.  13.9  18.6    49  35.9 
## # ... with 23 more rows

Matriz de correlación

  # Matriz de correlación

ggpairs(fires_area[,-c(1:3, 12)])

cor(fires_area[, -c(1:3, 12)])
##             FFMC         DMC          DC         ISI        temp         RH
## FFMC  1.00000000  0.32055252 -0.17227512  0.76054335  0.38085037 -0.4479377
## DMC   0.32055252  1.00000000  0.39014412  0.10424878  0.21615230 -0.2140209
## DC   -0.17227512  0.39014412  1.00000000 -0.21263329 -0.03560824 -0.3408187
## ISI   0.76054335  0.10424878 -0.21263329  1.00000000  0.47877095 -0.4167733
## temp  0.38085037  0.21615230 -0.03560824  0.47877095  1.00000000 -0.6477947
## RH   -0.44793766 -0.21402086 -0.34081874 -0.41677330 -0.64779472  1.0000000
## wind  0.03999076  0.04661713 -0.17986357  0.31983991  0.31833445  0.0263021
## area  0.09032888  0.08060299  0.01856502  0.04024941  0.18000670 -0.2748217
##             wind        area
## FFMC  0.03999076  0.09032888
## DMC   0.04661713  0.08060299
## DC   -0.17986357  0.01856502
## ISI   0.31983991  0.04024941
## temp  0.31833445  0.18000670
## RH    0.02630210 -0.27482173
## wind  1.00000000  0.08161989
## area  0.08161989  1.00000000
corrplot::corrplot(cor(fires_area[, -c(1:3, 12)]))

cor_fires <- cor(fires_area[-c(1:3, 12)])
cor.plot(cor_fires, order = "hclust")

PCA

#cargando los datos 

fires_pca
## # A tibble: 33 x 7
##     FFMC   DMC    DC   ISI  temp    RH   area
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
##  1  91   130.   693.   7    18.8    40 213.  
##  2  91   276.   825.   7.1  21.9    43  70.8 
##  3  91.7 191.   636.   7.8  19.9    50  82.8 
##  4  93.5 149.   729.   8.1  27.8    27  95.2 
##  5  92.5 121.   674.   8.6  18.2    46 201.  
##  6  91.6 108.   764    6.2  18      51   0   
##  7  81.6  56.7  666.   1.9  21.9    71  54.3 
##  8  94.9 130.   587.  14.1  33.1    25  26.4 
##  9  92.2 102.   752.   8.4  24.2    27   6.58
## 10  93.3 141.   714.  13.9  18.6    49  35.9 
## # ... with 23 more rows
#describiendo los datos
library(corrplot)
library(PerformanceAnalytics)
library(lattice)
library(cptcity)
library(cluster)


describe(fires_pca)
##      vars  n   mean     sd median trimmed   mad   min     max   range  skew
## FFMC    1 33  91.92   2.81  91.70   92.29  1.48  81.6   96.00   14.40 -1.81
## DMC     2 33 143.22  57.08 130.30  138.88 32.47  47.9  276.30  228.40  0.73
## DC      3 33 682.94 119.46 692.60  695.43 53.37 100.7  825.10  724.40 -3.36
## ISI     4 33   9.13   4.15   7.80    8.74  2.08   1.9   20.00   18.10  0.94
## temp    5 33  22.42   4.80  21.90   22.33  5.49  12.9   33.10   20.20  0.17
## RH      6 33  40.61  15.02  36.00   38.93 13.34  21.0   80.00   59.00  0.90
## area    7 33 109.00 221.42  42.87   57.29 59.13   0.0 1090.84 1090.84  3.33
##      kurtosis    se
## FFMC     4.40  0.49
## DMC     -0.05  9.94
## DC      14.32 20.80
## ISI      0.28  0.72
## temp    -0.77  0.84
## RH      -0.09  2.61
## area    10.82 38.54
#ploteo


#esttandarizando los datos
fires_esc <- scale(fires_pca) %>% as_tibble() %>% 
  print()
## # A tibble: 33 x 7
##       FFMC     DMC      DC    ISI   temp      RH    area
##      <dbl>   <dbl>   <dbl>  <dbl>  <dbl>   <dbl>   <dbl>
##  1 -0.329  -0.240   0.0808 -0.514 -0.754 -0.0404  0.469 
##  2 -0.329   2.33    1.19   -0.489 -0.107  0.159  -0.173 
##  3 -0.0797  0.844  -0.394  -0.321 -0.524  0.625  -0.119 
##  4  0.560   0.107   0.382  -0.249  1.12  -0.906  -0.0624
##  5  0.205  -0.388  -0.0715 -0.128 -0.879  0.359   0.415 
##  6 -0.115  -0.610   0.679  -0.706 -0.920  0.692  -0.492 
##  7 -3.67   -1.52   -0.145  -1.74  -0.107  2.02   -0.247 
##  8  1.06   -0.226  -0.802   1.20   2.23  -1.04   -0.373 
##  9  0.0980 -0.717   0.574  -0.177  0.372 -0.906  -0.463 
## 10  0.489  -0.0354  0.259   1.15  -0.795  0.559  -0.330 
## # ... with 23 more rows
#matriz de correlación
correlation <- cor(fires_esc) %>% 
  print()
##             FFMC         DMC          DC         ISI        temp         RH
## FFMC  1.00000000  0.32055252 -0.17227512  0.76054335  0.38085037 -0.4479377
## DMC   0.32055252  1.00000000  0.39014412  0.10424878  0.21615230 -0.2140209
## DC   -0.17227512  0.39014412  1.00000000 -0.21263329 -0.03560824 -0.3408187
## ISI   0.76054335  0.10424878 -0.21263329  1.00000000  0.47877095 -0.4167733
## temp  0.38085037  0.21615230 -0.03560824  0.47877095  1.00000000 -0.6477947
## RH   -0.44793766 -0.21402086 -0.34081874 -0.41677330 -0.64779472  1.0000000
## area  0.09032888  0.08060299  0.01856502  0.04024941  0.18000670 -0.2748217
##             area
## FFMC  0.09032888
## DMC   0.08060299
## DC    0.01856502
## ISI   0.04024941
## temp  0.18000670
## RH   -0.27482173
## area  1.00000000
#obteniendo la matríz de correlación
corrplot(correlation, 
         method = "number", 
         type = "full",
         diag = TRUE,
         tl.col = "black", 
         bg = "white", 
         title = "Correlation Matrix", 
         col = NULL)

#matriz de varianza-covarianza
matrix_cov <- cov(fires_esc) %>% print()
##             FFMC         DMC          DC         ISI        temp         RH
## FFMC  1.00000000  0.32055252 -0.17227512  0.76054335  0.38085037 -0.4479377
## DMC   0.32055252  1.00000000  0.39014412  0.10424878  0.21615230 -0.2140209
## DC   -0.17227512  0.39014412  1.00000000 -0.21263329 -0.03560824 -0.3408187
## ISI   0.76054335  0.10424878 -0.21263329  1.00000000  0.47877095 -0.4167733
## temp  0.38085037  0.21615230 -0.03560824  0.47877095  1.00000000 -0.6477947
## RH   -0.44793766 -0.21402086 -0.34081874 -0.41677330 -0.64779472  1.0000000
## area  0.09032888  0.08060299  0.01856502  0.04024941  0.18000670 -0.2748217
##             area
## FFMC  0.09032888
## DMC   0.08060299
## DC    0.01856502
## ISI   0.04024941
## temp  0.18000670
## RH   -0.27482173
## area  1.00000000
#suma de la diagonal o "varianza total"
diag(matrix_cov) %>% sum() 
## [1] 7
chart.Correlation(fires_esc, histogram = T, pch = 20)

# levelplot(
#   correlation,
#   col.regions = 
#     cpt(
#       pal = "cb_div_RdBu_11", n = 100
#     )
# )

#calculando eigenvectors y eigenvalues

eigen <- eigen(matrix_cov) %>% print()
## eigen() decomposition
## $values
## [1] 2.7314494 1.5214766 1.0291060 0.7940452 0.5463880 0.2548680 0.1226670
## 
## $vectors
##             [,1]         [,2]       [,3]         [,4]       [,5]        [,6]
## [1,] -0.48573787 -0.242090101  0.2635235  0.292683927 -0.2912026 -0.42108920
## [2,] -0.25211932  0.449221855  0.4236410  0.530073593  0.4287243 -0.01328501
## [3,] -0.03714229  0.727612240  0.1487127 -0.207889575 -0.4076672  0.32186496
## [4,] -0.47207059 -0.353132132  0.1746827  0.008223907 -0.2928079  0.65948299
## [5,] -0.46540188 -0.003302866 -0.1879590 -0.376182752  0.6523866  0.19226501
## [6,]  0.47933850 -0.247561872  0.2067998  0.396781359  0.2038618  0.47734491
## [7,] -0.17291637  0.155474898 -0.7876290  0.539558604 -0.1109033  0.13845045
##             [,7]
## [1,]  0.53686822
## [2,] -0.30034121
## [3,]  0.36616627
## [4,] -0.31813695
## [5,]  0.37943906
## [6,]  0.48921191
## [7,]  0.05455753
#calculando el pca
library(ade4)

pca <- dudi.pca(fires_esc, 
                scale = F, 
                scannf = F,
                nf = ncol(fires_esc)
)
summary(pca)
## Class: pca dudi
## Call: dudi.pca(df = fires_esc, scale = F, scannf = F, nf = ncol(fires_esc))
## 
## Total inertia: 6.788
## 
## Eigenvalues:
##     Ax1     Ax2     Ax3     Ax4     Ax5 
##  2.6487  1.4754  0.9979  0.7700  0.5298 
## 
## Projected inertia (%):
##     Ax1     Ax2     Ax3     Ax4     Ax5 
##  39.021  21.735  14.702  11.344   7.806 
## 
## Cumulative projected inertia (%):
##     Ax1   Ax1:2   Ax1:3   Ax1:4   Ax1:5 
##   39.02   60.76   75.46   86.80   94.61 
## 
## (Only 5 dimensions (out of 7) are shown)
#viendo los autovalores
pca$eig
## [1] 2.6486782 1.4753712 0.9979210 0.7699832 0.5298308 0.2471447 0.1189498
#verificando que la suma de los autovalores sea 7
sum(pca$eig)
## [1] 6.787879
#viendo los autovectores
pca$c1
##              CS1          CS2        CS3          CS4        CS5         CS6
## FFMC -0.48573787 -0.242090101 -0.2635235  0.292683927 -0.2912026 -0.42108920
## DMC  -0.25211932  0.449221855 -0.4236410  0.530073593  0.4287243 -0.01328501
## DC   -0.03714229  0.727612240 -0.1487127 -0.207889575 -0.4076672  0.32186496
## ISI  -0.47207059 -0.353132132 -0.1746827  0.008223907 -0.2928079  0.65948299
## temp -0.46540188 -0.003302866  0.1879590 -0.376182752  0.6523866  0.19226501
## RH    0.47933850 -0.247561872 -0.2067998  0.396781359  0.2038618  0.47734491
## area -0.17291637  0.155474898  0.7876290  0.539558604 -0.1109033  0.13845045
##              CS7
## FFMC  0.53686822
## DMC  -0.30034121
## DC    0.36616627
## ISI  -0.31813695
## temp  0.37943906
## RH    0.48921191
## area  0.05455753
#Scree plot 

library(factoextra)

fviz_screeplot(pca, 
               addlabels = TRUE,
               barfill = "lightblue1",
               barcolor = "lightblue1",
               linecolor= "indianred2",
               ggtheme = theme_grey())

#correlacion entre las variable soriginales y las componetes
levelplot(
  as.matrix(pca$co),
  col.regions =
    cpt(
      pal = "cb_div_RdBu_11", n = 100, rev = F
    ),
  xlab = "Variables",
  ylab = "Componentes"
)

#Varianza explicada o contribución de las variables
contrib <- as.matrix(pca$co ^ 2)

#Gráfico de contribución 

corrplot(contrib,
         method = "circle",
         tl.col = "black", 
         bg = "white", 
         title = "Contributions", 
         is.corr = F)

as.tibble(fires_esc)
## # A tibble: 33 x 7
##       FFMC     DMC      DC    ISI   temp      RH    area
##      <dbl>   <dbl>   <dbl>  <dbl>  <dbl>   <dbl>   <dbl>
##  1 -0.329  -0.240   0.0808 -0.514 -0.754 -0.0404  0.469 
##  2 -0.329   2.33    1.19   -0.489 -0.107  0.159  -0.173 
##  3 -0.0797  0.844  -0.394  -0.321 -0.524  0.625  -0.119 
##  4  0.560   0.107   0.382  -0.249  1.12  -0.906  -0.0624
##  5  0.205  -0.388  -0.0715 -0.128 -0.879  0.359   0.415 
##  6 -0.115  -0.610   0.679  -0.706 -0.920  0.692  -0.492 
##  7 -3.67   -1.52   -0.145  -1.74  -0.107  2.02   -0.247 
##  8  1.06   -0.226  -0.802   1.20   2.23  -1.04   -0.373 
##  9  0.0980 -0.717   0.574  -0.177  0.372 -0.906  -0.463 
## 10  0.489  -0.0354  0.259   1.15  -0.795  0.559  -0.330 
## # ... with 23 more rows
head(pca$li) 
##         Axis1       Axis2      Axis3      Axis4      Axis5      Axis6
## 1  0.70987271  0.29718048  0.5023348  0.2759903 -0.4417954 -0.2702754
## 2 -0.08511407  2.09971023 -1.1817873  0.8987875  0.7350938  0.1991528
## 3  0.55635525  0.05378052 -0.5432282  0.8847904  0.4382798 -0.1346976
## 4 -1.14160982  0.48903837  0.1430041 -0.6764099  0.3540450 -0.5036471
## 5  0.57078750 -0.25178004  0.2309192  0.5653815 -0.7050283 -0.1287344
## 6  1.36309593  0.25211865 -0.3925650 -0.1487619 -0.7025230 -0.1053718
##        Axis7
## 1 -0.1913240
## 2 -0.2574199
## 3 -0.2378134
## 4  0.4671501
## 5  0.1059234
## 6  0.5569388
dim(pca$li)
## [1] 33  7
#df de componentes principales
pca_data <- as_tibble(pca$li) %>% 
  dplyr::select(sprintf("Axis%1$s", 1:3)) %>% 
  print()
## # A tibble: 33 x 3
##      Axis1   Axis2   Axis3
##      <dbl>   <dbl>   <dbl>
##  1  0.710   0.297   0.502 
##  2 -0.0851  2.10   -1.18  
##  3  0.556   0.0538 -0.543 
##  4 -1.14    0.489   0.143 
##  5  0.571  -0.252   0.231 
##  6  1.36    0.252  -0.393 
##  7  4.06    0.178   1.30  
##  8 -2.46   -1.17    0.0673
##  9 -0.332   0.285   0.116 
## 10 -0.0849 -0.538  -0.878 
## # ... with 23 more rows
library(FactoMineR)


fviz_pca_ind(pca, col.var  = "cos2", 
             geom.ind = "point",
             col.ind = "royalblue3",
             labelsize = 3,
             repel = FALSE,
             ggtheme = theme_grey())

fviz_pca_biplot(pca, col.var  = "cos2", 
             geom.var = "arrow",
             geom.ind = "point",
             repel = FALSE,
             addEllipses = TRUE,
             col.ind = "tomato2",
             ggtheme = theme_grey(),
             title = "PCA BIPLOT")

library(rgl)

pc <- plot_ly(pca_data, x = ~Axis1, y = ~Axis2, z = ~Axis3, color = "blue")
print(pc)

#Visualización en 3d 
library(pca3d)
prc <- prcomp(fires_esc, scale.=F)

pca3d(prc, components = 1:3, 
      col = "royalblue1", 
      bg = "gray31",
      axes.color = "sandybrown",
      new = TRUE)
## [1] 0.08110631 0.10661512 0.07856704
## Creating new device
rglwidget()
#distancias
pca3d(prc, components = 1:3, 
      col = "royalblue1", 
      bg = "black",
      axes.color = "sandybrown",
      show.shadows = TRUE,
      new = TRUE)
## [1] 0.08110631 0.10661512 0.07856704
## Creating new device
rglwidget()
#elipse
pca3d(prc, components = 1:3, 
      col = "royalblue1", 
      bg = "white",
      axes.color = "sandybrown",
      show.ellipses=TRUE,
      ellipse.ci=0.9, show.plane=FALSE,
      new = TRUE)
## [1] 0.08110631 0.10661512 0.07856704
## Creating new device
rglwidget()

CLUSTER JERARQUICO

#Matriz de distancias Para este caso se usará la distancia euclideana, por ser la que mejor se acomoda a los nuestros datos.

distancia<- dist(pca_data, method = "euclidean")

#Heatmap Como una visualizacion previa de lo que podria ser el numero de clusters, se desarrolla el heatmap, no sin antes haber convertido a matriz nuestra data de distancia.

d <- as.matrix(distancia)
heatmap(d)

fviz_dist(dist.obj = distancia, lab_size = 10, 
          gradient = list(low = "#FC4E07", mid = "white", high = "#00AFBB"))

#clusterizando con el metodo ward Para este caso se utilizará una técnica de clusterizacion jerarquica, con la técnica Ward D, el cual trabaja con las sumatorias de la distancia al centroide.

hmodel<- hclust(distancia, method = "ward.D")

#dendograma Se plotea el dendograma para darnos una idea de como se agrupan nuestras observaciones en cada cluster, y la cantidad de estas.

plot(hmodel)

#obteniendo las alturas Para definir el corte del dendograma, necesitamos hallar la altura del corte, para eso extraemos la columna de alturas del hmodel.

names <- "height"
heights <- as.data.frame(hmodel$height) #hay 28 alturas

#ploteando las alturas Ploteamos el grafico de alturas y observaciones y definimos que el punto de quiebre se da a partir de la observacion numero 30, lo que significa que este modelo agrupa la data en 3 clusters.

ggplot(heights, aes(x = index(heights), y = hmodel$height)) +
  geom_line(color = "royalblue2",    # Color de la linea
            lwd = 1,      # Ancho de la linea
            linetype = 1) +
  geom_vline(xintercept=30, color = "orange") +
  geom_point(color = "royalblue4")+ labs( x = "Observaciones", y = "Altura",
          title ="Visualización de alturas")+
  theme(plot.title = element_text(hjust = 0.5))

#Obteniendo el valor de la altura

(hmodel$height)[30]
## [1] 8.821526

#cortando el dendograma

clust<- cutree(hmodel, k = 3)
length(clust)
## [1] 33

#numero de elementos en cada cluster

table(clust)
## clust
##  1  2  3 
## 17  9  7

#visualizando los grupo en el dendograma

fviz_dend(hmodel, k = 3, # Cortar en 3 grupos
          cex = 0.5, # Tamaño de las etiquetas
          k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"),
          color_labels_by_k = TRUE, # color labels by groups
          rect = TRUE, # Add rectangle around groups
          rect_border = c("#2E9FDF", "#00AFBB", "#E7B800"),
          rect_fill = TRUE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning in if (color == "cluster") color <- "default": la condición tiene
## longitud > 1 y sólo el primer elemento será usado

#visualizando los grupos

fviz_cluster(list(data = pca_data, cluster = clust),
             palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
             ellipse.type = "convex", # Concentration ellipse
             repel = TRUE, # Avoid label overplotting (slow)
             show.clust.cent = TRUE, ggtheme = theme_minimal()
)

library(cluster)

#Uniendo el numero de cluster al data frame

col <- c("#2E9FDF", "#00AFBB", "#E7B800")
data2 <- dplyr::bind_cols(pca_data, cluster = as.numeric(clust))

#visualizando en 3d

library(plotly)
hmodel3d <- plot_ly(data2, x = ~Axis1, y = ~Axis2, z = ~Axis3,
               marker = list(color = ~cluster, colorscale = c('#FFE1A1', '#683531'), showscale = FALSE))
hmodel3d
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

K-MEANS

library(car)
library(scatterplot3d)
pca_data
## # A tibble: 33 x 3
##      Axis1   Axis2   Axis3
##      <dbl>   <dbl>   <dbl>
##  1  0.710   0.297   0.502 
##  2 -0.0851  2.10   -1.18  
##  3  0.556   0.0538 -0.543 
##  4 -1.14    0.489   0.143 
##  5  0.571  -0.252   0.231 
##  6  1.36    0.252  -0.393 
##  7  4.06    0.178   1.30  
##  8 -2.46   -1.17    0.0673
##  9 -0.332   0.285   0.116 
## 10 -0.0849 -0.538  -0.878 
## # ... with 23 more rows
# Indicadores para K óptimo

# Valor de silueta

set.seed(2021)
fviz_nbclust(
  pca_data, kmeans, method = "silhouette",
  k.max = 15
)

# Valor de suma de cuadrados totales

set.seed(2021)
fviz_nbclust(
  pca_data, kmeans, method = "wss",
  k.max = 15
)

# NbClust for determining the best number of clusters

library(NbClust)

nb <- NbClust::NbClust(pca_data,
                       distance = "euclidean",
                       min.nc = 2, max.nc = 20,
                       method = "kmeans",
                       index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 3 proposed 2 as the best number of clusters 
## * 6 proposed 3 as the best number of clusters 
## * 2 proposed 5 as the best number of clusters 
## * 1 proposed 6 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 12 as the best number of clusters 
## * 2 proposed 13 as the best number of clusters 
## * 2 proposed 17 as the best number of clusters 
## * 2 proposed 19 as the best number of clusters 
## * 3 proposed 20 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
fviz_nbclust(nb)
## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 1 proposed  1 as the best number of clusters
## * 3 proposed  2 as the best number of clusters
## * 6 proposed  3 as the best number of clusters
## * 2 proposed  5 as the best number of clusters
## * 1 proposed  6 as the best number of clusters
## * 1 proposed  7 as the best number of clusters
## * 1 proposed  12 as the best number of clusters
## * 2 proposed  13 as the best number of clusters
## * 2 proposed  17 as the best number of clusters
## * 2 proposed  19 as the best number of clusters
## * 3 proposed  20 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  3 .

nb$Best.nc # Valores máximos de cada indicador
##                      KL      CH Hartigan     CCC   Scott  Marriot   TrCovW
## Number_clusters  3.0000 19.0000  13.0000 17.0000  7.0000     3.00   3.0000
## Value_Index     33.6854 64.5483  10.3022  5.6188 43.0845 26026.89 768.9477
##                  TraceW Friedman   Rubin  Cindex      DB Silhouette   Duda
## Number_clusters  3.0000  17.0000 13.0000 12.0000 20.0000    20.0000 2.0000
## Value_Index     19.0599  77.4363 -6.4272  0.2435  0.4192     0.6004 1.3669
##                 PseudoT2   Beale Ratkowsky    Ball PtBiserial Frey McClain
## Number_clusters    2.000  2.0000    5.0000  3.0000     6.0000    1  3.0000
## Value_Index       -8.053 -0.4301    0.3796 29.5099     0.5765   NA  0.5803
##                    Dunn Hubert SDindex Dindex    SDbw
## Number_clusters 19.0000      0  5.0000      0 20.0000
## Value_Index      0.6362      0  2.1293      0  0.0072
# K-means

set.seed(2021)
c3 = kmeans(pca_data, centers = 3, iter.max = 100,
            nstart = 100)

c3$cluster
##  [1] 2 2 2 3 2 2 2 3 2 2 2 2 3 3 2 3 3 1 2 2 2 3 3 2 2 3 2 2 3 3 2 2 3
c3$centers
##        Axis1      Axis2      Axis3
## 1  1.9041828 -5.3307561  0.0413673
## 2  0.9444667  0.3795228 -0.1653002
## 3 -1.7327931 -0.1883083  0.2720531
# Silueta por cada cluster

silueta <- cluster::silhouette(c3$cluster, dist(pca_data))

fviz_silhouette(silueta)
##   cluster size ave.sil.width
## 1       1    1          0.00
## 2       2   20          0.39
## 3       3   12          0.34

#Gráfico de clusters basados en sus centroides

centroides_c3 = data.frame(c3$centers)

centroides_c3$kmeans3 = 1:3

library(reshape)
centroides_km3_2 = reshape::melt(centroides_c3, id=c("kmeans3"))

str(centroides_km3_2)
## 'data.frame':    9 obs. of  3 variables:
##  $ kmeans3 : int  1 2 3 1 2 3 1 2 3
##  $ variable: Factor w/ 3 levels "Axis1","Axis2",..: 1 1 1 2 2 2 3 3 3
##  $ value   : num  1.904 0.944 -1.733 -5.331 0.38 ...
library(ggplot2)
ggplot(centroides_km3_2, aes(x=kmeans3, y=value, fill=variable))+
  geom_bar(stat="identity", position="dodge")

# Cluster con PCA

cluster::clusplot(pca_data, c3$cluster, color=T, labels=2) 

# * grafico en 3D ----

pc = princomp(fires_pca_esc, cor = T, scores = T)
ls(pc)
## [1] "call"     "center"   "loadings" "n.obs"    "scale"    "scores"   "sdev"
head(pc$scores)
##           Comp.1     Comp.2     Comp.3     Comp.4     Comp.5      Comp.6
## [1,] -0.99476586 -0.7256432  0.2187196  1.2686308  0.3459760 -0.04017855
## [2,]  0.08133548 -1.8648166  0.1888528 -1.7759907  0.8409043 -0.37866522
## [3,] -0.49523818  0.1177687  0.1668085 -0.7612738  0.8659261 -0.31701276
## [4,]  1.01305336 -0.7758514  0.1538842  0.5043303 -0.6645919 -0.65028613
## [5,] -0.91419220 -0.3724056  0.7927401  1.5892586  0.6641811  0.09232420
## [6,] -1.07059169  0.4039000 -0.4879735 -1.3309747 -0.2047690  1.17963038
##           Comp.7      Comp.8
## [1,] -0.07337053 -0.23629428
## [2,] -0.04677559 -0.25668831
## [3,]  0.21439304 -0.21182802
## [4,]  0.15620701  0.47404577
## [5,] -0.35382580  0.02508472
## [6,]  0.75853243  0.71420830
class(head(pc$scores))
## [1] "matrix" "array"
pca_data
## # A tibble: 33 x 3
##      Axis1   Axis2   Axis3
##      <dbl>   <dbl>   <dbl>
##  1  0.710   0.297   0.502 
##  2 -0.0851  2.10   -1.18  
##  3  0.556   0.0538 -0.543 
##  4 -1.14    0.489   0.143 
##  5  0.571  -0.252   0.231 
##  6  1.36    0.252  -0.393 
##  7  4.06    0.178   1.30  
##  8 -2.46   -1.17    0.0673
##  9 -0.332   0.285   0.116 
## 10 -0.0849 -0.538  -0.878 
## # ... with 23 more rows
summary(pc)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4     Comp.5
## Standard deviation     1.6712591 1.2667889 1.0307082 0.9790796 0.89086371
## Proportion of Variance 0.3491384 0.2005943 0.1327949 0.1198246 0.09920477
## Cumulative Proportion  0.3491384 0.5497326 0.6825275 0.8023522 0.90155693
##                            Comp.6     Comp.7     Comp.8
## Standard deviation     0.69781481 0.42347941 0.34823017
## Proportion of Variance 0.06086819 0.02241685 0.01515803
## Cumulative Proportion  0.96242512 0.98484197 1.00000000
biplot(pc)

#install.packages('rgl')
library(rgl)

plot3d(pc$scores[, 1:3], col = c4$cluster, size = 10)
rglwidget() # grafico aparece en Viewer
c3_cluster_f = as.factor(c3$cluster) #En factor para graficar con scatter
c3_cluster_f
##  [1] 2 2 2 3 2 2 2 3 2 2 2 2 3 3 2 3 3 1 2 2 2 3 3 2 2 3 2 2 3 3 2 2 3
## Levels: 1 2 3
scatter3d(x = pc$scores[,1], y = pc$scores[,2], z = pc$scores[,3],
          xlab = "PC1", ylab = "PC2" , zlab = "PC3",
          groups = c3_cluster_f, surface=FALSE)
rglwidget() # grafico aparece en Viewer
# Validación

library(clusterSim)

# Indice Davies - Boulding

val <- c3$cluster

index <- index.DB(pca_data, val, centrotypes = "centroids")
index$DB
## [1] 0.8492732
# Indice Dunn

library(clValid)
dunn(distance = NULL, Data = pca_data, clusters = val)
## [1] 0.05618022
# Variables originales

c3$cluster
##  [1] 2 2 2 3 2 2 2 3 2 2 2 2 3 3 2 3 3 1 2 2 2 3 3 2 2 3 2 2 3 3 2 2 3
fires_cluster <- 
  fires_pca %>% 
  mutate(
    clust = c3$cluster
  )

fires_cluster
## # A tibble: 33 x 8
##     FFMC   DMC    DC   ISI  temp    RH   area clust
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <int>
##  1  91   130.   693.   7    18.8    40 213.       2
##  2  91   276.   825.   7.1  21.9    43  70.8      2
##  3  91.7 191.   636.   7.8  19.9    50  82.8      2
##  4  93.5 149.   729.   8.1  27.8    27  95.2      3
##  5  92.5 121.   674.   8.6  18.2    46 201.       2
##  6  91.6 108.   764    6.2  18      51   0        2
##  7  81.6  56.7  666.   1.9  21.9    71  54.3      2
##  8  94.9 130.   587.  14.1  33.1    25  26.4      3
##  9  92.2 102.   752.   8.4  24.2    27   6.58     2
## 10  93.3 141.   714.  13.9  18.6    49  35.9      2
## # ... with 23 more rows
table(fires_cluster$clust)
## 
##  1  2  3 
##  1 20 12

CLUSTER HKMEANS

#Hkmeans -----------------
# Compute hierarchical k-means clustering
library(factoextra)

hkmodel <-hkmeans(pca_data, 3)
hkmodel$centers
##        Axis1      Axis2      Axis3
## 1  0.3505248  0.4866362 -0.3133842
## 2  2.4338417 -0.8623223  0.3353830
## 3 -1.8374042 -0.2374768  0.2728954
# Elements returned by hkmeans()
names(hkmodel)
##  [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
##  [6] "betweenss"    "size"         "iter"         "ifault"       "data"        
## [11] "hclust"
# Visualize the tree
fviz_dend(hkmodel, cex = 0.6, palette = "jco",
          rect = TRUE, rect_border = "jco", rect_fill = TRUE)

# Visualize the hkmeans final clusters

fviz_cluster(hkmodel,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
             ellipse.type = "convex", # Concentration ellipse
             repel = TRUE, # Avoid label overplotting (slow)
             show.clust.cent = TRUE, 
             ggtheme = theme_minimal()
)

length(hkmodel$cluster)
## [1] 33
head(hkmodel$data)
## # A tibble: 6 x 3
##     Axis1   Axis2  Axis3
##     <dbl>   <dbl>  <dbl>
## 1  0.710   0.297   0.502
## 2 -0.0851  2.10   -1.18 
## 3  0.556   0.0538 -0.543
## 4 -1.14    0.489   0.143
## 5  0.571  -0.252   0.231
## 6  1.36    0.252  -0.393
table(hkmodel$cluster)
## 
##  1  2  3 
## 16  6 11
library(plotly)
datahk <- dplyr::bind_cols(hkmodel$data, cluster = as.numeric(clust))

hkmodel <- plot_ly(datahk, x = ~Axis1, y = ~Axis2, z = ~Axis3,
               marker = list(color = ~cluster, colorscale = c('#FFE1A1', '#683531'), showscale = FALSE))
hkmodel